import pandas as pd
data=pd.read_excel('Capstone_final_dataset.xlsx')
data.head()


data.shape

(5822, 32)


data.isnull().sum()


data.drop('a1c_cat', axis=1, inplace=True)


data.head()


data.isnull().sum()


import seaborn as sns
import matplotlib.pyplot as plt


# Check unique values in 'first_tx'
print(data['first_tx'].unique())

['ENZALUTA' 'ABIRATER']


# Mapping categorical values to numeric
data['first_tx'] = data['first_tx'].map({'ENZALUTA': 1, 'ABIRATER': 0})
data.head()


# Check unique values in 'race'
print(data['race'].unique())

['White' 'Black' 'Other' 'Unknown']


# Mapping categorical values to numeric
data['race'] = data['race'].map({'White': 1, 'Black' : 0, 'Other': 3, 'Unknown': 4})
data.head()


# Check unique values in 'sex'
print(data['SEX'].unique())

['M']


data.drop('SEX', axis=1, inplace=True)


data.head()


# Check unique values in 'fi_score_cat'
print(data['fi_score_cat'].unique())

['2. pre-f' '1. non-f' '3. mild' '4. moder' '5. sever']


# Mapping categorical values to numeric
data['fi_score_cat'] = data['fi_score_cat'].map({'1. non-f': 0, '2. pre-f' : 1, '3. mild': 2, '4. moder': 3,'5. sever':5})
data.head()


# Check unique values in 'psa_cat'
print(data['psa_cat'].unique())

['Cat5. 50' 'Cat2. 4' 'Cat3. 10' 'Cat4. 20' 'Cat7. 20' 'Cat1. 0'
 'Cat6. 10' 'Unknown']


# Mapping categorical values to numeric
data['psa_cat'] = data['psa_cat'].map({'Cat5. 50':5, 'Cat2. 4':2, 'Cat3. 10':3, 'Cat4. 20':4, 'Cat7. 20':7, 'Cat1. 0':1,
 'Cat6. 10':6, 'Unknown':0})
data.head()


import pandas as pd

# Assuming df is your DataFrame
data['gleason_reviewed'] = data['gleason_reviewed'].fillna(0)


data.isnull().sum()


# Replace empty strings with NaN
import numpy as np
data = data.replace(' ', np.nan)

<ipython-input-76-5a78547b3815>:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data = data.replace(' ', np.nan)


import pandas as pd

# Assuming df is your DataFrame
data['gleason_reviewed'] = data['gleason_reviewed'].fillna(0)


# Check unique values in 'gleason_reviewed'
print(data['gleason_reviewed'].unique())

[ 9.  7.  0.  8.  6. 10.  5.  2.  4.  3.]


# Calculate the correlation matrix
correlation_matrix = data.corr()

# Create a heatmap
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()


import matplotlib.pyplot as plt
import seaborn as sns
# Set the aesthetics for the plots
sns.set(style='whitegrid')

# List of all columns in the DataFrame excluding 'death'
columns_to_visualize = data.columns.drop('death')

# Create a figure to visualize the relationship with 'death'
plt.figure(figsize=(20, 25))

# Loop through each column and create plots
for i, col in enumerate(columns_to_visualize):
    plt.subplot(6, 6, i + 1)  # Adjust the number of rows and columns for the grid
    if data[col].nunique() < 20:  # Categorical variable
        sns.countplot(x='death', hue=col, data=data, palette='viridis')
        plt.title(f'Death vs {col}')
    else:  # Numeric variable
        sns.boxplot(x='death', y=col, data=data, palette='viridis')
        plt.title(f'Death vs {col}')
    plt.xlabel('Death (1 = Yes, 0 = No)')
    plt.ylabel(col)

plt.tight_layout()
plt.show()

<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')


import matplotlib.pyplot as plt
pd.plotting.scatter_matrix(data, figsize=(50, 50))
plt.show()


print(data.columns)

Index(['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
       'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
       'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
       'fu_end_date_year', 'death', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
       'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
       'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
       'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed'],
      dtype='object')


import matplotlib.pyplot as plt

# Grouping the data by the features and calculating the mean of 'death'
enza_death = data.groupby('enza')['death'].mean()
abiraterone_death = data.groupby('ABIRATERONE')['death'].mean()
psa_cat_death = data.groupby('psa_cat')['death'].mean()

# Plotting death vs. enza
plt.figure(figsize=(12, 6))
plt.plot(enza_death.index, enza_death.values, marker='o', label='Death vs Enza')
plt.title('Line Graph: Death vs Enza')
plt.xlabel('Enza (0 or 1)')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()

# Plotting death vs. ABIRATERONE
plt.figure(figsize=(12, 6))
plt.plot(abiraterone_death.index, abiraterone_death.values, marker='o', color='orange', label='Death vs ABIRATERONE')
plt.title('Line Graph: Death vs ABIRATERONE')
plt.xlabel('ABIRATERONE (0 or 1)')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()

# Plotting death vs. psa_cat
plt.figure(figsize=(12, 6))
plt.plot(psa_cat_death.index, psa_cat_death.values, marker='o', color='green', label='Death vs PSA Category')
plt.title('Line Graph: Death vs PSA Category')
plt.xlabel('PSA Category')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()


#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix

#Selecting required attributes for analysis
X = data[['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
       'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
       'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
       'fu_end_date_year', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
       'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
       'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
       'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed']]
y = data['death']  # Target variable

#Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

#Fitting a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

#Making predictions on the test set
y_pred = model.predict(X_test)
# Printing the results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Training Accuracy: 0.9742323384152888
Test Accuracy: 0.967381974248927


# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Selecting required attributes for analysis
X = data[['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
       'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
       'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
       'fu_end_date_year', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
       'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
       'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
       'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed']]  # Selecting features
y = data['death']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Fitting a Logistic Regression model
model = LogisticRegression(max_iter=1000)  # You can increase max_iter if convergence issues arise
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Evaluating the model
test_accuracy = accuracy_score(y_test, y_pred_test)
train_accuracy = accuracy_score(y_train, y_pred_train)

# Printing the results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Confusion matrix for the test data
conf_matrix = confusion_matrix(y_test, y_pred_test)
classification_rep = classification_report(y_test, y_pred_test)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Training Accuracy: 0.8937083959630664
Test Accuracy: 0.8755364806866953
Confusion Matrix:
 [[186  86]
 [ 59 834]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72       272
           1       0.91      0.93      0.92       893

    accuracy                           0.88      1165
   macro avg       0.83      0.81      0.82      1165
weighted avg       0.87      0.88      0.87      1165

/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Initialize a StandardScaler
scaler = StandardScaler()

# Fit and transform the training data, and transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression with scaled data
log_reg_model = LogisticRegression(max_iter=10000000)
log_reg_model.fit(X_train_scaled, y_train)

# Cross-validation
cross_val_scores = cross_val_score(log_reg_model, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", cross_val_scores)
print("Mean cross-validation score:", cross_val_scores.mean())

# Predictions on test data
y_pred = log_reg_model.predict(X_test_scaled)

# Evaluate accuracy
train_accuracy = log_reg_model.score(X_train_scaled, y_train)
test_accuracy = log_reg_model.score(X_test_scaled, y_test)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Cross-validation scores: [0.97532189 0.972103   0.97099893 0.97851772 0.97207304]
Mean cross-validation score: 0.973802916242169
Training Accuracy: 0.9742323384152888
Test Accuracy: 0.9682403433476395


log_reg_model = LogisticRegression(solver='saga', max_iter=10000000)
log_reg_model.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=10000000, solver='saga')

LogisticRegression(max_iter=10000000, solver='saga')


from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)

print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')

/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Cross-validation scores: [0.8832618  0.89613734 0.88745704 0.89690722 0.89261168]
Mean cross-validation score: 0.8912750173296168

/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


from sklearn.ensemble import RandomForestClassifier

# Initialize and fit a Random Forest model
rf_model = RandomForestClassifier()
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5)
print(f'Cross-validation scores for Random Forest: {cv_scores_rf}')
print(f'Mean cross-validation score for Random Forest: {cv_scores_rf.mean()}')

Cross-validation scores for Random Forest: [0.9751073  0.97339056 0.97164948 0.97508591 0.96563574]
Mean cross-validation score for Random Forest: 0.9721737976195743


from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Best parameters and score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation score: 0.9730324616904857


# Train the final Random Forest model with the best parameters
rf_best = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    random_state=42
)

# Fit the model on the training data
rf_best.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_best.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Test Accuracy: {test_accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Test Accuracy: 0.967381974248927
Confusion Matrix:
[[270   2]
 [ 36 857]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       272
           1       1.00      0.96      0.98       893

    accuracy                           0.97      1165
   macro avg       0.94      0.98      0.96      1165
weighted avg       0.97      0.97      0.97      1165


#PREDICTING THE MODEL
import pandas as pd
import numpy as np


# Select a specific row of data (e.g., the first row)
input_data = data.iloc[0, :29].values

# Convert the selected row to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array for prediction (only one instance)
data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make the prediction using the model
prediction = model.predict(data_reshaped)

# Output the prediction result
if prediction[0] == 0:
    print('The Person did not die due to prostate cancer')
else:
    print('The Person has died due to prostate cancer')

The Person has died due to prostate cancer

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(


# Importing the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming 'data' is the DataFrame that contains the dataset, and 'death' is the target variable
X = data.drop(columns=['death'])
y = data['death']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Standardizing the features for KNN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. K-Nearest Neighbors (KNN) Model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Predictions and Evaluation for KNN
knn_predictions = knn_model.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_conf_matrix = confusion_matrix(y_test, knn_predictions)
knn_class_report = classification_report(y_test, knn_predictions)

# Calculate training accuracy
train_predictions = knn_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, knn_predictions)

# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("KNN Model Evaluation")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", knn_conf_matrix)
print("Classification Report:\n", knn_class_report)

Training Accuracy: 0.9233412067854843
Test Accuracy: 0.8721030042918455
KNN Model Evaluation
Accuracy: 0.8721030042918455
Confusion Matrix:
 [[156 116]
 [ 33 860]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.57      0.68       272
           1       0.88      0.96      0.92       893

    accuracy                           0.87      1165
   macro avg       0.85      0.77      0.80      1165
weighted avg       0.87      0.87      0.86      1165


# 2. Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predictions and Evaluation for Naive Bayes
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_conf_matrix = confusion_matrix(y_test, nb_predictions)
nb_class_report = classification_report(y_test, nb_predictions)

# Calculate training accuracy
train_predictions = nb_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, nb_predictions)

# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("\nNaive Bayes Model Evaluation")
print("Accuracy:", nb_accuracy)
print("Confusion Matrix:\n", nb_conf_matrix)
print("Classification Report:\n", nb_class_report)

Training Accuracy: 0.789778827571398
Test Accuracy: 0.9622317596566523

Naive Bayes Model Evaluation
Accuracy: 0.9622317596566523
Confusion Matrix:
 [[260  12]
 [ 32 861]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       272
           1       0.99      0.96      0.98       893

    accuracy                           0.96      1165
   macro avg       0.94      0.96      0.95      1165
weighted avg       0.96      0.96      0.96      1165

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but GaussianNB was fitted with feature names
  warnings.warn(


# Importing the necessary libraries
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


X = data.drop(columns=['death'])
y = data['death']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Standardizing the features for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM Model with a linear kernel
svm_model = SVC(kernel='linear', C=1.0, random_state=50)
svm_model.fit(X_train_scaled, y_train)

# Predictions and Evaluation for SVM
svm_predictions = svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)
svm_class_report = classification_report(y_test, svm_predictions)

# Calculate training accuracy
train_predictions = svm_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, svm_predictions)

# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("SVM Model Evaluation")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_conf_matrix)
print("Classification Report:\n", svm_class_report)

Training Accuracy: 0.9740176079020829
Test Accuracy: 0.9682403433476395
SVM Model Evaluation
Accuracy: 0.9682403433476395
Confusion Matrix:
 [[272   0]
 [ 37 856]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94       272
           1       1.00      0.96      0.98       893

    accuracy                           0.97      1165
   macro avg       0.94      0.98      0.96      1165
weighted avg       0.97      0.97      0.97      1165


#@title Convert ipynb to HTML in Colab
# Upload ipynb
from google.colab import files
f = files.upload()

# Convert ipynb to html
import subprocess
file0 = list(f.keys())[0]
_ = subprocess.run(["pip", "install", "nbconvert"])
_ = subprocess.run(["jupyter", "nbconvert", file0, "--to", "html"])

# download the html
files.download(file0[:-5]+"html")

Saving capstone (1).ipynb to capstone (1).ipynb

	time0_year	first_tx	race	SEX	age	above74	age_cat	ABIRATERONE	enza	...	a1c_cat	cindex_b_Romano	char_cat	Total_Elixhauser_Groups	fi_score_cat	dis_number	fi_score	frail	gleason_reviewed
0	2017	ENZALUTA	White	M	70	0	3	0	1	...	1. <5.6	2	1	3	2. pre-f	4	0.129032	0	9
1	2015	ABIRATER	White	M	68	0	2	1	0	...	1. <5.6	4	2	7	1. non-f	3	0.096774	0	7
2	2015	ABIRATER	White	M	71	0	3	1	0	...	1. <5.6	3	1	5	1. non-f	3	0.096774	0
3	2015	ABIRATER	White	M	79	1	3	1	0	...	4. >=7.2	7	3	10	3. mild	9	0.290323	1	9
4	2016	ABIRATER	White	M	90	1	5	1	0	...	NaN	3	1	6	4. moder	10	0.322581	1

	0
time0_year	0
first_tx	0
race	0
black	0
SEX	0
age	0
above74	0
age_cat	0
ABIRATERONE	0
enza	0
first_tx_last_day_Supply	0
days_bt_first_last_prescrib	0
first_tx_daysSupply_sum	0
fu_end_date_year	0
death	0
crcl_cat	0
albumin_cat	0
bilirubin_cat	0
hgb_cat	0
psa_cat	0
PSACAT	0
BMI_cat	0
a1c_cat	2401
cindex_b_Romano	0
char_cat	0
Total_Elixhauser_Groups	0
DOCETAXEL_bf_time0	0
fi_score_cat	0
dis_number	0
fi_score	0
frail	0
gleason_reviewed	0